Attempting to use tidymodels for processing our IBM krill data and
selecting variables of importance for each swimming parameter
library(tidymodels) # for the parsnip package, along with the rest of tidymodels
# Helper packages
library(readr) # for importing data
library(broom.mixed) # for converting bayesian models to tidy tibbles
library(dotwhisker) # for visualizing regression results
library(vip) # for variable importance plots
rm(list=ls(all=TRUE)) ## removes the previous workspace and environment so that we only have the data we need loaded in the session
load("~/Post-doc/krill-tank-code/DataProcessing/Notebooks/ParameterizationModel.Rdata")
dim(parameters)
glimpse(parameters)
dim(conditions)
glimpse(conditions)
df <- data.frame(conditions[,1],conditions[,2],conditions[,3],conditions[,4],parameters[,1], parameters[,7])
colnames(df) <- c("flow","chl", "guano", "light","ave.v", "dip.test")
df
df<-na.omit(df)
ave.v ~ flow * chl * guano * light
linear_reg()
linear_reg() %>%
set_engine("keras")
lm_mod <- linear_reg()
lm_fit <-
lm_mod %>%
fit(ave.v ~ flow * chl * guano * light, data = df) ## creates a linear model (LM)
lm_fit
tidy(lm_fit) ## summary of model
tidy(lm_fit) %>%
dwplot(dot_args = list(size = 2, color = "black"),
whisker_args = list(color = "black"),
vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))
new_points <- expand.grid(flow = 6.1,
chl = 19,
guano = 0,
light = 0) ## create new points
new_points
mean_pred <- predict(lm_fit, new_data = new_points) ## mean predicted body width
mean_pred
conf_int_pred <- predict(lm_fit,
new_data = new_points,
type = "conf_int") ## confidence interval prediction
conf_int_pred
plot_data <-
new_points %>%
bind_cols(mean_pred) %>%
bind_cols(conf_int_pred)
# and plot:
ggplot(plot_data, aes(x = flow)) +
geom_point(aes(y = .pred)) +
geom_errorbar(aes(ymin = .pred_lower,
ymax = .pred_upper),
width = .2) +
labs(y = "ave v")
# set the prior distribution
prior_dist <- rstanarm::student_t(df = 1)
set.seed(123)
# make the parsnip model
bayes_mod <-
linear_reg() %>%
set_engine("stan",
prior_intercept = prior_dist,
prior = prior_dist)
# train the model
bayes_fit <-
bayes_mod %>%
fit(ave.v ~ flow * chl * guano * light, data = df)
print(bayes_fit, digits = 5)
tidy(bayes_fit, conf.int = TRUE)
bayes_plot_data <-
new_points %>%
bind_cols(predict(bayes_fit, new_data = new_points)) %>%
bind_cols(predict(bayes_fit, new_data = new_points, type = "conf_int"))
ggplot(bayes_plot_data, aes(x = flow)) +
geom_point(aes(y = .pred)) +
geom_errorbar(aes(ymin = .pred_lower, ymax = .pred_upper), width = .2) +
labs(y = "ave.v") +
ggtitle("Bayesian model with t(1) prior distribution")
### Tuning the model in sections
df %>%
group_by(light) %>%
summarize(med_flow = median(flow))
bayes_mod %>%
fit(ave.v ~ flow * chl * guano * light, data = df)
ggplot(df,
aes(flow, ave.v)) + # returns a ggplot object
geom_jitter() + # same
geom_smooth(method = lm, se = FALSE) + # same
labs(x = "flow", y = "ave.v") # etc
Inputs
rf_mod <- ## creates random forest model
rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
set.seed(234) ## comment out to get random runs
rf_fit <- ## fits random forest model to whole dataset
rf_mod %>%
fit(ave.v ~ flow * chl * guano * light, data = df)
rf_fit
rf_pred <- predict(rf_fit, df)
plot(df$ave.v, rf_pred$.pred, main = "corr - 0.7294325") ## observed vs predicted
cor <- cor.test(df$ave.v, rf_pred$.pred) ## gives correlation coef
rf_split <- initial_split(df %>% select(flow, chl, guano, light, ave.v), ## splits cases base on initial results
strata = NULL) ## with strata = NULL splits 11/31, rf_test has all guano absent, 50/50 light split, uneven flow split (0, 3, 3, 5.9, 5.9, 5.9, 8.9 x5), and random chl values.
## with strata = flow, splits 12/30, rf_test has 1 guano present, 50/50 light split, even flow splits (3x 0, 3, 4x 5.9 and 2x 8.9), and more even chl values.
## play with prop = 0.8, 0.9, etc
rf_train <- training(rf_split) ##creates training and testing datasets
rf_test <- testing(rf_split)
## comparisons to test data using ROC and accuracy to measure performance
rf_fit2 <- ## fits random forest model to training dataset
rf_mod %>%
fit(ave.v ~ flow * chl * guano * light, data = rf_train)
rf_fit2
rf_pred2 <- predict(rf_fit2, rf_test) ## compares to test data
plot(rf_test$ave.v, rf_pred2$.pred, main = "corr - 0.2700745") ## observed vs predicted, can add cor value from cor.test below
cor2 <- cor.test(rf_test$ave.v, rf_pred2$.pred) ## gives correlation coef
cor2$estimate
### can run with different seeds and splits, strata = NULL
#### loop and run 10 times with diff seeds
## look at consistency of results
Models
##### Example models
library(tidymodels) # for the parsnip package, along with the rest of tidymodels
# Helper packages
library(readr) # for importing data
library(broom.mixed) # for converting bayesian models to tidy tibbles
library(dotwhisker) # for visualizing regression results
library(vip) # for variable importance plots
rm(list=ls(all=TRUE)) ## removes the previous workspace and environment so that we only have the data we need loaded in the session
load("~/Bigelow/Data/ParameterizationModel.15.07.24.Rdata")
seeing if this fixes error
dim(parameters)
glimpse(parameters)
dim(conditions)
glimpse(conditions)
df <- data.frame(conditions[,1],conditions[,2],conditions[,3],conditions[,4],parameters[,12])
colnames(df) <- c("flow","chl", "guano", "light",colnames(parameters)[12])
df
df<-na.omit(df)
source("notebook16-functions.R")
## fix to pull col name or number into model??
colnames(df) <- c("flow","chl", "guano", "light", colnames(parameters)[12])
c.v <- NULL
c.v_train <- NULL
p <- NULL
rsq2 <- NULL
mod1 <- NULL
mod2 <- NULL
mod3 <- NULL
node.purity <- data.frame(matrix (NA, nrow = 10, ncol = 5))
colnames(node.purity) <- c( "c.e", "c.e.t", "r.squared", "p-value", "node purity")
library(ggplot2)
library(GGally)
library(hexbin)
library(diptest)
library(randomForest)
library(reprtree)
## loop through different swimming parameters contain this loop within the bigger loop
colnames(parameters)
for (j in colnames(parameters)){
df <- data.frame(conditions[,1],conditions[,2],conditions[,3],conditions[,4], parameters[,j])
colnames(df) <- c("flow","chl", "guano", "light", j)
df
df<-na.omit(df)
source("notebook16-functions.R")
print(colnames(df))
for (i in 1:10){
c.e <- rf.skill(df = df, trees = 2000, col1 = colnames(df)[5])
c.v <- rbind(c.e, c.v)
c.e.t <- rf.skill.test(df = df, trees = 2000, col1 = colnames(df)[5], prop = 0.8, strata = NULL, do.plot = FALSE) ## training data
c.v_train <- rbind(c.e.t, c.v_train)
rsq1 <- rf.fit(df = df, trees = 2000, col1 = colnames(df)[5])
rsq2 <- rbind(rsq1, rsq2)
output.test <- model.test(df = df, trees = 2000, col1 = colnames(df)[5], prop = 0.8, strata = NULL, do.plot = TRUE)
## output node purity
## create function to filter edge effect out (.5 cm out, dist to edge)
##ggsave(filename=paste('~/Bigelow/Figures/tidymodels Output/ave v/', i, '.tiff', sep = ''), plot = p, width = 24 , height = 12)
}
mod1 <- rbind(mod1, c.v) ## add identifier for each parameter into data frame
mod2 <- rbind(mod2, c.v_train)
mod3 <- rbind(mod3, rsq2)
## input average of p value, ce, cv, node purity and rsq into each column and row in matrix for each parameter and model type
## rank on best fit
## table for report of stats on models
}
## take best model for each pararmeter into simulation
## try vel mean and sd, and turn mean and sd
rf_pred_train <- predict(rf_fit_train, rf_test) ## need to predict each parameter within simulation
## environmental input to swimming, eg. node purity
c.v <- as.data.frame(c.v)
c.v_train <- as.data.frame(c.v_train)
rsq2 <- as.data.frame(rsq2)
node.purity$c.e <- c.v$V1
node.purity$c.e.t <- c.v_train$cor
node.purity$r.squared <- rsq2$V1
Node Purity heat map
### to save each plot as an individual graph
#jpeg(filename= paste('~/Bigelow/Figures/tidymodels Output/fit', i,'.jpeg', sep = ''), width = 960, height = 780)
#plot(df$response, rf_pred$.pred, main = paste("corr = ", cor$estimate)) ## observed vs predicted
#dev.off()
#best_tree <- rf.skill.test(df = df, trees = 2000, "ave.v", prop = 0.5, strata = NULL, do.plot = TRUE) ## not working....
#bt <- rbind(best_tree, bt)
## look at things outside random forest
rf_training_pred <-
predict(rf_fit, rf_train) %>%
bind_cols(predict(rf_fit, rf_train, type = "numeric")) %>%
# Add the true outcome data back in
bind_cols(rf_train %>%
select(flow, chl, guano, light))
rf_training_pred$light <- as.factor(rf_training_pred$light)
rf_training_pred$guano <- as.factor(rf_training_pred$guano)
rf_training_pred %>% # training set predictions
roc_auc(truth = light, .pred...1)
rf_training_pred %>% # training set predictions, only works for factors
accuracy(truth = ave.v, .pred...2)
## now that the model has exceptional performance lets move to the test dataset
rf_testing_pred <-
predict(rf_fit, rf_test) %>%
bind_cols(predict(rf_fit, rf_test, type = "numeric")) %>%
bind_cols(rf_test %>% select(flow, chl, guano, light))
rf_testing_pred$light <- as.factor(rf_testing_pred$light)
rf_testing_pred$guano <- as.factor(rf_testing_pred$guano)
rf_testing_pred %>% # test set predictions
roc_auc(truth = light, .pred...1)
rf_testing_pred %>% # test set predictions
accuracy(truth = light, guano)
## differences caused by training set error (bias) by model
###### resampling to the rescue
set.seed(345)
folds <- vfold_cv(rf_train, v = 10)
folds
rf_wf <- ## bundles workflow and random forest model together without a recipe needed
workflow() %>%
add_model(rf_mod) %>%
add_formula(ave.v ~ .)
set.seed(456)
rf_fit_rs <-
rf_wf %>%
fit_resamples(folds) ##fits resamples
rf_fit_rs ## .metrics column contains metrics on model performance
collect_metrics(rf_fit_rs) ##manually unnests meterics data
rf_testing_pred %>% # test set predictions (AS ABOVE)
roc_auc(truth = light, .pred...1)
rf_testing_pred %>% # test set predictions (AS ABOVE)
accuracy(truth = light, guano)
Tuning the model
library(glmnet)
library(rpart.plot) # for visualizing a decision tree
library(vip) # for variable importance plots
tune_spec <-
decision_tree( ## this is the type of model
cost_complexity = tune(),
tree_depth = tune()
) %>%
set_engine("rpart") %>%
set_mode("regression")
tune_spec
tree_grid <- grid_regular(cost_complexity(),
tree_depth(),
levels = 5)
tree_grid
tree_grid %>% ## shows each level we will tune the model at
count(tree_depth)
set.seed(234) ## don't understand what these do??
rf_folds <- vfold_cv(rf_train) ## creates cross-validation folds for tuning
set.seed(345)
tree_wf <- workflow() %>% ##creates the workflow
add_model(tune_spec) %>%
add_formula(ave.v ~ .)
tree_res <- ## resamples and tunes model
tree_wf %>%
tune_grid(
resamples = rf_folds,
grid = tree_grid
)
tree_res ## gives tuning results
tree_res %>%
collect_metrics() ## collects metrics from tuned models
tree_res %>%
collect_metrics() %>%
mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(cost_complexity, mean, color = tree_depth)) +
geom_line(size = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
## stubbiest tree with a depth of 1 performed the worst
## deepest tree with depth of 15 did better
tree_res %>%
show_best(metric = "rmse") ## shows best model fit
best_tree <- tree_res %>% ## pulls out data on the best fit
select_best(metric = "rmse")
best_tree ## summary of best tree model
final_wf <- ## create workflow from best tree model after tuning
tree_wf %>%
finalize_workflow(best_tree)
final_wf
final_fit <- ## create final model from new fit
final_wf %>%
last_fit(rf_split)
final_fit %>%
collect_metrics()
final_fit %>% ## plot ROC and compare performance after tuning
collect_predictions() %>%
roc_curve(flow, ave.v) %>% ### NOT WORKING
autoplot()
final_tree <- extract_workflow(final_fit) ## extract our final fit for future use
final_tree
final_tree %>% ## creates workflow plot
extract_fit_engine() %>%
rpart.plot(roundint = FALSE)
final_tree %>% ## shows which variables are most important to the model in a plot
extract_fit_parsnip() %>%
vip()
args(decision_tree)
Bigger RF Model
cores <- parallel::detectCores() ## sees how many cores we have to process the data
cores
rf_mod <- ## random forest model generation, parallel processing of models
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger", num.threads = cores) %>%
set_mode("regression")
rf_recipe <- ## create random forest model recipe
recipe(ave.v ~ ., data = df)
rf_workflow <- ## create random forest model workflow
workflow() %>%
add_model(rf_mod) %>%
add_recipe(rf_recipe)
rf_mod
val_set <- validation_split(df,
strata = flow,
prop = 0.80)
val_set
# show what will be tuned
extract_parameter_set_dials(rf_mod)
set.seed(345)
rf_res <-
rf_workflow %>%
tune_grid(val_set,
grid = 25,
control = control_grid(save_pred = TRUE),
metrics = metric_set(rmse))
rf_res %>% ## shows 5 best random forest models out of the 25 candidates
show_best(metric = "rmse")
autoplot(rf_res) ## plot results
rf_best <- ## creates model with best predictors
rf_res %>%
select_best(metric = "rmse")
rf_best
rf_res %>% ## collects data for ROC curve plot
collect_predictions()
rf_best$mtry <- as.integer(rf_best$mtry)
##NOT WORKING
rf_auc <- ## creates set of models with best model and model model for comparison
rf_res %>%
collect_predictions(parameters = rf_best) %>%
roc_curve(mtry, .pred) %>%
mutate(model = "Random Forest")
##NOT WORKING
bind_rows(rf_best, rf_res) %>% ## plots model comparisons on ROC curve
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
scale_color_viridis_d(option = "plasma", end = .6)
################ last model after tuning
# the last model
last_rf_mod <-
rand_forest(mtry = 8, min_n = 7, trees = 1000) %>%
set_engine("ranger", num.threads = cores, importance = "impurity") %>%
set_mode("regression")
# the last workflow
last_rf_workflow <-
rf_workflow %>%
update_model(last_rf_mod)
# the last fit
set.seed(345)
last_rf_fit <-
last_rf_workflow %>%
last_fit(rf_split)
last_rf_fit
last_rf_fit %>% ## collect metrics from final model
collect_metrics()
last_rf_fit %>% ## updates model fit
extract_fit_parsnip() %>%
vip(num_features = 20)
##NOT WORKING
last_rf_fit %>% ## plots best ROC curve, with best set of hyperparameters as predictors
collect_predictions() %>%
roc_curve(ave.v, .pred...1) %>%
autoplot()
---
title: "notebook16-tidymodels"
output: html_notebook
---
Attempting to use tidymodels for processing our IBM krill data and selecting variables of importance for each swimming parameter

```{r}
library(tidymodels)  # for the parsnip package, along with the rest of tidymodels

# Helper packages
library(readr)       # for importing data
library(broom.mixed) # for converting bayesian models to tidy tibbles
library(dotwhisker)  # for visualizing regression results
library(vip)         # for variable importance plots


rm(list=ls(all=TRUE))   ## removes the previous workspace and environment so that we only have the data we need loaded in the session
load("~/Post-doc/krill-tank-code/DataProcessing/Notebooks/ParameterizationModel.Rdata")


dim(parameters)
glimpse(parameters)

dim(conditions)
glimpse(conditions)

df <- data.frame(conditions[,1],conditions[,2],conditions[,3],conditions[,4],parameters[,1], parameters[,7])
colnames(df) <- c("flow","chl", "guano", "light","ave.v", "dip.test")
df
df<-na.omit(df)


ave.v ~ flow * chl * guano * light


linear_reg()

linear_reg() %>% 
  set_engine("keras")

lm_mod <- linear_reg()

lm_fit <- 
  lm_mod %>% 
  fit(ave.v ~ flow * chl * guano * light, data = df)  ## creates a linear model (LM)
lm_fit

tidy(lm_fit)  ## summary of model


tidy(lm_fit) %>% 
  dwplot(dot_args = list(size = 2, color = "black"),
         whisker_args = list(color = "black"),
         vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2))

new_points <- expand.grid(flow = 6.1,
                          chl = 19,
                          guano = 0,
                          light = 0) ## create new points
new_points

mean_pred <- predict(lm_fit, new_data = new_points)  ## mean predicted body width
mean_pred

conf_int_pred <- predict(lm_fit, 
                         new_data = new_points, 
                         type = "conf_int")  ## confidence interval prediction
conf_int_pred

plot_data <- 
  new_points %>% 
  bind_cols(mean_pred) %>% 
  bind_cols(conf_int_pred)

# and plot:
ggplot(plot_data, aes(x = flow)) + 
  geom_point(aes(y = .pred)) + 
  geom_errorbar(aes(ymin = .pred_lower, 
                    ymax = .pred_upper),
                width = .2) + 
  labs(y = "ave v")

# set the prior distribution
prior_dist <- rstanarm::student_t(df = 1)

set.seed(123)

# make the parsnip model
bayes_mod <-   
  linear_reg() %>% 
  set_engine("stan", 
             prior_intercept = prior_dist, 
             prior = prior_dist) 

# train the model
bayes_fit <- 
  bayes_mod %>% 
  fit(ave.v ~ flow * chl * guano * light, data = df)

print(bayes_fit, digits = 5)

tidy(bayes_fit, conf.int = TRUE)

bayes_plot_data <- 
  new_points %>% 
  bind_cols(predict(bayes_fit, new_data = new_points)) %>% 
  bind_cols(predict(bayes_fit, new_data = new_points, type = "conf_int"))

ggplot(bayes_plot_data, aes(x = flow)) + 
  geom_point(aes(y = .pred)) + 
  geom_errorbar(aes(ymin = .pred_lower, ymax = .pred_upper), width = .2) + 
  labs(y = "ave.v") + 
  ggtitle("Bayesian model with t(1) prior distribution")

### Tuning the model in sections

df %>% 
  group_by(light) %>% 
  summarize(med_flow = median(flow))

bayes_mod %>% 
  fit(ave.v ~ flow * chl * guano * light, data = df)

ggplot(df,
       aes(flow, ave.v)) +      # returns a ggplot object 
  geom_jitter() +                         # same
  geom_smooth(method = lm, se = FALSE) +  # same                    
  labs(x = "flow", y = "ave.v")         # etc
```
Inputs

```{r}

rf_mod <- ## creates random forest model
  rand_forest(trees = 1000) %>% 
  set_engine("ranger") %>% 
  set_mode("regression")


set.seed(234)  ## comment out to get random runs

rf_fit <- ## fits random forest model to whole dataset
  rf_mod %>% 
  fit(ave.v ~ flow * chl * guano * light, data = df)
rf_fit

rf_pred <- predict(rf_fit, df)  
plot(df$ave.v, rf_pred$.pred, main = "corr - 0.7294325")  ## observed vs predicted
cor <- cor.test(df$ave.v, rf_pred$.pred)  ## gives correlation coef

rf_split <- initial_split(df %>% select(flow, chl, guano, light, ave.v), ## splits cases base on initial results
                            strata = NULL)  ## with strata = NULL splits 11/31, rf_test has all guano absent, 50/50 light split, uneven flow split (0, 3, 3, 5.9, 5.9, 5.9, 8.9 x5), and random chl values.
                                            ## with strata = flow, splits 12/30, rf_test has 1 guano present, 50/50 light split, even flow splits (3x 0, 3, 4x 5.9 and 2x 8.9), and more even chl values.
## play with prop = 0.8, 0.9, etc


rf_train <- training(rf_split)  ##creates training and testing datasets
rf_test  <- testing(rf_split)

## comparisons to test data using ROC and accuracy to measure performance

rf_fit2 <- ## fits random forest model to training dataset
  rf_mod %>% 
  fit(ave.v ~ flow * chl * guano * light, data = rf_train)
rf_fit2

rf_pred2 <- predict(rf_fit2, rf_test) ## compares to test data
plot(rf_test$ave.v, rf_pred2$.pred, main = "corr - 0.2700745")  ## observed vs predicted, can add cor value from cor.test below
cor2 <- cor.test(rf_test$ave.v, rf_pred2$.pred)  ## gives correlation coef
cor2$estimate
### can run with different seeds and splits, strata = NULL
#### loop and run 10 times with diff seeds
## look at consistency of results
```

Models
```{r}
##### Example models
library(tidymodels)  # for the parsnip package, along with the rest of tidymodels

# Helper packages
library(readr)       # for importing data
library(broom.mixed) # for converting bayesian models to tidy tibbles
library(dotwhisker)  # for visualizing regression results
library(vip)         # for variable importance plots


rm(list=ls(all=TRUE))   ## removes the previous workspace and environment so that we only have the data we need loaded in the session
load("~/Bigelow/Data/ParameterizationModel.15.07.24.Rdata")
```

seeing if this fixes error
```{r}
dim(parameters)
glimpse(parameters)

dim(conditions)
glimpse(conditions)

df <- data.frame(conditions[,1],conditions[,2],conditions[,3],conditions[,4],parameters[,12])
colnames(df) <- c("flow","chl", "guano", "light",colnames(parameters)[12])
df
df<-na.omit(df)

source("notebook16-functions.R")
## fix to pull col name or number into model??
colnames(df) <- c("flow","chl", "guano", "light", colnames(parameters)[12])
c.v <- NULL
c.v_train <- NULL
p <- NULL
rsq2 <- NULL
mod1 <- NULL
mod2 <- NULL
mod3 <- NULL
node.purity <- data.frame(matrix (NA, nrow = 10, ncol = 5))
colnames(node.purity) <- c( "c.e", "c.e.t", "r.squared", "p-value", "node purity")


library(ggplot2)
library(GGally)
library(hexbin)
library(diptest)
library(randomForest)
library(reprtree)

## loop through different swimming parameters contain this loop within the bigger loop
colnames(parameters)


for (j in colnames(parameters)){
    df <- data.frame(conditions[,1],conditions[,2],conditions[,3],conditions[,4], parameters[,j])
    colnames(df) <- c("flow","chl", "guano", "light", j)
    df
    df<-na.omit(df)
    source("notebook16-functions.R")
    print(colnames(df))

      for (i in 1:10){  

        c.e <- rf.skill(df = df, trees = 2000, col1 = colnames(df)[5]) 
        c.v <- rbind(c.e, c.v)  
        c.e.t <- rf.skill.test(df = df, trees = 2000, col1 = colnames(df)[5], prop = 0.8, strata = NULL, do.plot = FALSE)   ## training data
        c.v_train <- rbind(c.e.t, c.v_train)
        rsq1 <- rf.fit(df = df, trees = 2000, col1 = colnames(df)[5])
        rsq2 <- rbind(rsq1, rsq2)
  
        output.test <- model.test(df = df, trees = 2000, col1 = colnames(df)[5], prop = 0.8, strata = NULL, do.plot = TRUE)
        
        ## output node purity
  
        ## create function to filter edge effect out (.5 cm out, dist to edge)
  
        ##ggsave(filename=paste('~/Bigelow/Figures/tidymodels Output/ave v/', i, '.tiff', sep = ''), plot = p,     width = 24 , height = 12)
      }
    mod1 <- rbind(mod1, c.v)  ## add identifier for each parameter into data frame
    mod2 <- rbind(mod2, c.v_train)
    mod3 <- rbind(mod3, rsq2)
    
    ## input average of p value, ce, cv, node purity and rsq into each column and row in matrix for each parameter and model type
    
    ## rank on best fit
    ## table for report of stats on models
    
}

## take best model for each pararmeter into simulation
## try vel mean and sd, and turn mean and sd 

rf_pred_train <- predict(rf_fit_train, rf_test) ## need to predict each parameter within simulation

## environmental input to swimming, eg. node purity



c.v <- as.data.frame(c.v)
c.v_train <- as.data.frame(c.v_train)
rsq2 <- as.data.frame(rsq2)

node.purity$c.e <- c.v$V1
node.purity$c.e.t <- c.v_train$cor
node.purity$r.squared <- rsq2$V1

```

Node Purity heat map
```{r}
##### Example data
set.seed(123)                                                     # Set seed for reproducibility
data<- matrix(rnorm(100, 0, 10), nrow = 10, ncol = 10)           # Create example data                                                   # Apply heatmap function

colnames(data)<- paste0("col", 1:10)                             # Column names
rownames(data)<- paste0("row", 1:10)                             # Row names
head(data,5)

node <- read.csv("C:\\Users\\Nicole Hellessey\\Documents\\Bigelow\\Data\\Node Purity Values - Overall Predictors.csv", header = T)
head(node)
node2 <- data.matrix(node)

#####node##### Example 1
heatmap(node2)  

##### Example 2
heatmap(node2, Rowv = NA, Colv = NA)                               # Remove dendogram

##### Example 3
my_colors<- colorRampPalette(c("cyan", "deeppink3"))             # Manual color range
heatmap(node2, col = my_colors(100))                               # Heatmap with manual colors

##### Example 4                                # Install reshape package
library(reshape)                                                # Load reshape package

node_melt <- melt(node)                                           # Reorder data
library(ggplot2)                                                # Load ggplot2 package

ggp <- ggplot(node_melt, aes(Factor, variable)) +                           # Create heatmap with ggplot2
  geom_tile(aes(fill = value))
ggp                                                               # Print heatmap

#########################################################
library(ggplot2)
library(reshape2)

attach(node_melt)

ggplot(node_melt, aes(x = Factor, y = variable, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = "Environmental Condition", y = "Swimming Parameter", title = "Node Purity Heatmap - Overall Predictors")

#########################################################
# load required packages
library(plotly)
library(reshape2)

# create heatmap using plotly
plot_ly(node_melt, x = Factor, y = variable, z = value, type = "heatmap") %>%
  layout(title = "Heatmap", xaxis = list(title = "Column"), yaxis = list(title = "Row"))
```


```{r}
  ### to save each plot as an individual graph
  #jpeg(filename= paste('~/Bigelow/Figures/tidymodels Output/fit', i,'.jpeg', sep = ''), width = 960,      height = 780)
  #plot(df$response, rf_pred$.pred, main = paste("corr = ", cor$estimate))  ## observed vs predicted
  #dev.off()


  #best_tree <- rf.skill.test(df = df, trees = 2000, "ave.v", prop = 0.5, strata = NULL, do.plot = TRUE) ## not working....
  #bt <- rbind(best_tree, bt)

## look at things outside random forest


rf_training_pred <- 
  predict(rf_fit, rf_train) %>% 
  bind_cols(predict(rf_fit, rf_train, type = "numeric")) %>% 
  # Add the true outcome data back in
  bind_cols(rf_train %>% 
              select(flow, chl, guano, light))

rf_training_pred$light <- as.factor(rf_training_pred$light)
rf_training_pred$guano <- as.factor(rf_training_pred$guano)

rf_training_pred %>%                # training set predictions
  roc_auc(truth = light, .pred...1)

rf_training_pred %>%                # training set predictions, only works for factors
  accuracy(truth = ave.v, .pred...2)

## now that the model has exceptional performance lets move to the test dataset

rf_testing_pred <- 
  predict(rf_fit, rf_test) %>% 
  bind_cols(predict(rf_fit, rf_test, type = "numeric")) %>% 
  bind_cols(rf_test %>% select(flow, chl, guano, light))

rf_testing_pred$light <- as.factor(rf_testing_pred$light)
rf_testing_pred$guano <- as.factor(rf_testing_pred$guano)

rf_testing_pred %>%                   # test set predictions
  roc_auc(truth = light, .pred...1)

rf_testing_pred %>%                   # test set predictions
  accuracy(truth = light, guano)

## differences caused by training set error (bias) by model

###### resampling to the rescue

set.seed(345)
folds <- vfold_cv(rf_train, v = 10)
folds

rf_wf <- ## bundles workflow and random forest model together without a recipe needed
  workflow() %>%
  add_model(rf_mod) %>%
  add_formula(ave.v ~ .)

set.seed(456)
rf_fit_rs <- 
  rf_wf %>% 
  fit_resamples(folds)  ##fits resamples

rf_fit_rs  ## .metrics column contains metrics on model performance

collect_metrics(rf_fit_rs)  ##manually unnests meterics data

rf_testing_pred %>%                   # test set predictions (AS ABOVE)
  roc_auc(truth = light, .pred...1)

rf_testing_pred %>%                   # test set predictions  (AS ABOVE)
  accuracy(truth = light, guano)
```
Tuning the model
```{r}
library(glmnet)
library(rpart.plot)  # for visualizing a decision tree
library(vip)         # for variable importance plots

tune_spec <- 
  decision_tree(  ## this is the type of model
    cost_complexity = tune(),
    tree_depth = tune()
  ) %>% 
  set_engine("rpart") %>% 
  set_mode("regression")

tune_spec

tree_grid <- grid_regular(cost_complexity(),
                          tree_depth(),
                          levels = 5)

tree_grid

tree_grid %>% ## shows each level we will tune the model at
  count(tree_depth)

set.seed(234) ## don't understand what these do??
rf_folds <- vfold_cv(rf_train)  ## creates cross-validation folds for tuning

set.seed(345)

tree_wf <- workflow() %>%  ##creates the workflow
  add_model(tune_spec) %>%
  add_formula(ave.v ~ .)

tree_res <- ## resamples and tunes model
  tree_wf %>% 
  tune_grid(
    resamples = rf_folds,
    grid = tree_grid
    )

tree_res  ## gives tuning results

tree_res %>% 
  collect_metrics()  ## collects metrics from tuned models


tree_res %>%
  collect_metrics() %>%
  mutate(tree_depth = factor(tree_depth)) %>%
  ggplot(aes(cost_complexity, mean, color = tree_depth)) +
  geom_line(size = 1.5, alpha = 0.6) +
  geom_point(size = 2) +
  facet_wrap(~ .metric, scales = "free", nrow = 2) +
  scale_x_log10(labels = scales::label_number()) +
  scale_color_viridis_d(option = "plasma", begin = .9, end = 0)

## stubbiest tree with a depth of 1 performed the worst
## deepest tree with depth of 15 did better

tree_res %>%
  show_best(metric = "rmse")  ## shows best model fit

best_tree <- tree_res %>%  ## pulls out data on the best fit
  select_best(metric = "rmse")

best_tree ## summary of best tree model

final_wf <- ## create workflow from best tree model after tuning
  tree_wf %>% 
  finalize_workflow(best_tree)

final_wf

final_fit <- ## create final model from new fit
  final_wf %>%
  last_fit(rf_split) 

final_fit %>%
  collect_metrics()


final_fit %>%  ## plot ROC and compare performance after tuning
  collect_predictions() %>% 
  roc_curve(flow, ave.v) %>% ### NOT WORKING
  autoplot()

final_tree <- extract_workflow(final_fit)  ## extract our final fit for future use
final_tree

final_tree %>%  ## creates workflow plot
  extract_fit_engine() %>%
  rpart.plot(roundint = FALSE)

final_tree %>% ## shows which variables are most important to the model in a plot
  extract_fit_parsnip() %>% 
  vip()

args(decision_tree)


```
Bigger RF Model
```{r}
cores <- parallel::detectCores() ## sees how many cores we have to process the data
cores

rf_mod <- ## random forest model generation, parallel processing of models
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", num.threads = cores) %>% 
  set_mode("regression")

rf_recipe <- ## create random forest model recipe
  recipe(ave.v ~ ., data = df) 
  
rf_workflow <- ## create random forest model workflow
  workflow() %>% 
  add_model(rf_mod) %>% 
  add_recipe(rf_recipe)

rf_mod

val_set <- validation_split(df, 
                            strata = flow, 
                            prop = 0.80)
val_set
# show what will be tuned
extract_parameter_set_dials(rf_mod)

set.seed(345)
rf_res <- 
  rf_workflow %>% 
  tune_grid(val_set,
            grid = 25,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))

rf_res %>% ## shows 5 best random forest models out of the 25 candidates
  show_best(metric = "rmse")

autoplot(rf_res)  ## plot results

```

```{r}
rf_best <- ## creates model with best predictors
  rf_res %>% 
  select_best(metric = "rmse")
rf_best

rf_res %>% ## collects data for ROC curve plot
  collect_predictions()

rf_best$mtry <- as.integer(rf_best$mtry)


##NOT WORKING
rf_auc <- ## creates set of models with best model and model model for comparison
  rf_res %>% 
  collect_predictions(parameters = rf_best) %>% 
  roc_curve(mtry, .pred) %>% 
  mutate(model = "Random Forest")  

##NOT WORKING
bind_rows(rf_best, rf_res) %>% ## plots model comparisons on ROC curve
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) + 
  geom_path(lwd = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) + 
  coord_equal() + 
  scale_color_viridis_d(option = "plasma", end = .6)
```

```{r}
################ last model after tuning 

# the last model
last_rf_mod <- 
  rand_forest(mtry = 8, min_n = 7, trees = 1000) %>% 
  set_engine("ranger", num.threads = cores, importance = "impurity") %>% 
  set_mode("regression")

# the last workflow
last_rf_workflow <- 
  rf_workflow %>% 
  update_model(last_rf_mod)

# the last fit
set.seed(345)
last_rf_fit <- 
  last_rf_workflow %>% 
  last_fit(rf_split)

last_rf_fit

last_rf_fit %>%  ## collect metrics from final model
  collect_metrics()

last_rf_fit %>% ## updates model fit
  extract_fit_parsnip() %>% 
  vip(num_features = 20)

##NOT WORKING
last_rf_fit %>% ## plots best ROC curve, with best set of hyperparameters as predictors
  collect_predictions() %>% 
  roc_curve(ave.v, .pred...1) %>% 
  autoplot()

```

